In this report, we are going to explore US commercial flights. All the data come from the Research and Innovative Technology Administration (RITA) which coordinates the United States Department of Transporation research programs. It contains 22 years of data (over 123 Millions of observations) and 29 different variables.
The dataset is stored into an SQLite database so it’s needed to create the bridge to retrieve data.
path_db = "/Users/davidemastricci/Desktop/Progetto_Statistica/data/ontime.sqlite3"
ontime <- dbConnect(RSQLite::SQLite(), dbname=path_db)
from_db <- function(sql) {
dbGetQuery(ontime, sql)
}
The aim of this analysis is to answer the following questions:
How is the trend of commercial flights in the US?
Which is the best (worst) day of the week to take a flight?
Which is the best (worst) season to flight in terms of delay?
Before answering those questions let’s explore the dataset and find some little insights.
Those following are useful data for the introduction.
flights <- from_db(" select Year, Month, DayofMonth, DayOfWeek, ArrDelay, UniqueCarrier, TailNum, Origin, Dest, Distance, CancellationCode, Cancelled
from ontime")
########### Number of Aricraft #############
aircrafts <- flights %>%
filter(TailNum != "NA") %>%
group_by(Year) %>%
summarise(NumberOfAircraft = n_distinct(TailNum))
aircrafts_plot <- ggplot(aircrafts, aes(y=NumberOfAircraft, x=Year))+
geom_line()+
ggtitle("Number of aircraft per year")
############ Carrier Distribution #################
carrier <- flights %>%
filter(UniqueCarrier != "NA")
carrier$UniqueCarrier[carrier$UniqueCarrier != "DL" &
carrier$UniqueCarrier != "WN" &
carrier$UniqueCarrier != "AA" &
carrier$UniqueCarrier != "US" &
carrier$UniqueCarrier != "UA" &
carrier$UniqueCarrier != "NW"] <- "Other"
carrier <- carrier %>%
group_by(UniqueCarrier) %>%
summarise(NumberOfIstances = n())
carrier_plot <- ggplot(carrier, aes(x=factor(1), y=NumberOfIstances / sum(NumberOfIstances), fill=as.factor(UniqueCarrier))) +
geom_bar(stat = 'identity')+
coord_polar(theta = "y")+
xlab('')+
ylab('')+
labs(fill="Carrier")+
theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)+
theme(axis.text.x=element_blank())+
geom_text(aes(label = paste0(round((100*carrier$NumberOfIstances / sum(carrier$NumberOfIstances))),"%")), position = position_stack(vjust = 0.5))+
ggtitle("Carrier Distribution")
######### Distance analysis #################
max_distance <- flights %>%
select(one_of(c("Distance", "Origin", "Dest"))) %>%
filter(Distance != 0)
min_distance <- max_distance[max_distance$Distance == min(max_distance$Distance),]
min_distance <- min_distance[1,]
max_distance <- max_distance[max_distance$Distance == max(max_distance$Distance),]
max_distance <- max_distance[1,]
airports <- read.csv("/Users/davidemastricci/Desktop/Progetto_Statistica/data/airports.csv",quote = '""')
max_distance$OriginLat <- airports$lat[airports$iata == max_distance$Origin]
max_distance$OriginLon <- airports$long[airports$iata == max_distance$Origin]
max_distance$DestLat <- airports$lat[airports$iata == max_distance$Dest]
max_distance$DestLon <- airports$long[airports$iata == max_distance$Dest ]
max_distance$OriginState <- airports$state[airports$iata == max_distance$Origin]
max_distance$DestState <- airports$state[airports$iata == max_distance$Dest]
min_distance$OriginLat <- airports$lat[airports$iata == min_distance$Origin]
min_distance$OriginLon <- airports$long[airports$iata == min_distance$Origin]
min_distance$DestLat <- airports$lat[airports$iata == min_distance$Dest]
min_distance$DestLon <- airports$long[airports$iata == min_distance$Dest ]
min_distance$OriginState <- airports$state[airports$iata == min_distance$Origin]
min_distance$DestState <- "WA" # https://it.wikipedia.org/wiki/Fairchild_Air_Force_Base
distance <- max_distance %>%
bind_rows(min_distance)
########### Airports analysis ###########
number_airports <- nrow(airports)
######### Cancellation analysis #########
cancellation <- flights[flights$Cancelled == 1,]
cancellation$CancellationCode[cancellation$CancellationCode == "NA" | cancellation$CancellationCode == ""] <- "E"
cancellation$CancellationCode <- recode(cancellation$CancellationCode,
"A"="Carrier",
"B"="Weather",
"C"="NAS",
"D"="Security",
"E"="Unknown")
cancellation_plot <- ggplot(data = cancellation, aes(x=CancellationCode))+
geom_bar(aes(y = (..count..)/sum(..count..)))+
scale_y_continuous(labels=percent)+
ggtitle("Causes of cancellations")+
ylab("% of Cancellation")
distance;
ggarrange(aircrafts_plot, carrier_plot, cancellation_plot, ncol = 2, nrow=2);
The first table showed above describe the maximum\minimum distance travelled by aircraft.
The first row (max) shows the route from John F. Kennedy International (JFK), state of New York (NY), to Honolulu International (HNL), state of Hawaii (HI). The length of this route is 4983 miles;
The second row (min) shows the route from Spokane International (GEG), state of Washington (WA), to Fairchild AFB (SKA), also in the state of Washington (WA). The length of this route is 6 miles.
Those are just 4 of the 3376 present in the dataset.
The set of 3 charts showed above help us to summarize some aspects of the dataset like the:
To measure the trend of US commercial flights in the US we are going to use the number of flights per year.
no_cancelled_flights <- from_db(" select Year, Month, DayofMonth, DayOfWeek, ArrDelay
from ontime
where Cancelled <> 1 and Year > 1987 ") #1987 has only three months recorded
# Adjusting the type of some variables
no_cancelled_flights$Month <- as.factor(no_cancelled_flights$Month)
no_cancelled_flights$DayOfWeek <- as.factor(no_cancelled_flights$DayOfWeek)
no_cancelled_flights$DayOfWeek <- recode(no_cancelled_flights$DayOfWeek,
"1"="Monday",
"2"="Tuesday",
"3"="Wednesday",
"4"="Thursday",
"5"="Friday",
"6"="Saturday",
"7"="Sunday")
no_cancelled_flights$Month<- recode(no_cancelled_flights$Month,
"1"= "Jan",
"2"= "Feb",
"3"= "Mar",
"4"= "Apr",
"5"= "May",
"6"= "Jun",
"7"= "Jul",
"8"= "Aug",
"9"= "Sep",
"10"= "Oct",
"11"= "Nov",
"12"= "Dec")
From the plot above we can clearly see a slight increase of the number of flights over the years from 1998 until 2001 and then after a drop in 2002 a rapid growth is observed. After that we can observe two years (2005 - 2006) of flatness followed by a slight growth (2007) and fall (2008). Let’s see if interpolating the points helps us to understand the trend direction of the curve
lm_plot <- ggplot(number_of_flights_analysis, aes(x=Year, y=NumberOfFlights))+
geom_line()+
stat_smooth(method = 'lm', aes(colour = 'linear'), level=.95) +
ggtitle("Flights trend in US")+
scale_y_continuous(labels = scales::format_format(scientific= FALSE, big.mark=".", decimal.mark=","))
loess_plot <- ggplot(number_of_flights_analysis, aes(x=Year, y=NumberOfFlights))+
geom_line()+
stat_smooth(method = 'loess', aes(colour = 'loess'), level = .95, span = .3) +
ggtitle("Flight trend in US")+
scale_y_continuous(labels = scales::format_format(scientific= FALSE, big.mark=".", decimal.mark=","))
ggarrange(lm_plot, loess_plot, nrow = 2, ncol = 1)
Both linear and loess models highlight the tendecy to grow over the time which means that we can expect the number of flights to increse in the future.
We all want to take a flight and have no problem with delay. Those data can help us finding the best day to flight avoiding delays. This is the the amount of delay for each day:
no_cancelled_flights$Month<- recode(no_cancelled_flights$Month,
"Jan" = "1",
"Feb" = "2",
"Mar" = "3",
"Apr"="4",
"May"="5",
"Jun"="6",
"Jul"="7",
"Aug"="8",
"Sep"="9",
"Oct"="10",
"Nov"="11",
"Dec"="12")
no_cancelled_flights <- no_cancelled_flights %>%
mutate(Date = paste0(DayofMonth, "-",Month,"-",Year))
no_cancelled_flights$Date <- as.Date(no_cancelled_flights$Date, "%d-%m-%Y")
late_flight <- no_cancelled_flights %>%
filter(ArrDelay > 0)
delay_analysis <- late_flight %>%
group_by(Date) %>%
summarise(NumberOfFlights = n())
temp <- no_cancelled_flights %>%
group_by(Date) %>%
summarise(NumberOfFlights = n())
delay_analysis <- full_join(temp, delay_analysis, by="Date")
delay_analysis$PercLateFlight <- (delay_analysis$NumberOfFlights.y / delay_analysis$NumberOfFlights.x)
# Days with no delayed flight will have NA as a value of column PercLateFlight so here we subistute with 0 which
# means 0% of delayed flights
delay_analysis$PercLateFlight[is.na(delay_analysis$PercLateFlight)] <- 0
moving_average_30 <- rollmean(delay_analysis$PercLateFlight, 30)
min_ma <- min(moving_average_30)
max_ma <- max(moving_average_30)
delay_analysis %>%
ggplot(aes(x=Date, y=PercLateFlight))+
geom_line()+
geom_hline(yintercept = min_ma, color='red')+
geom_hline(yintercept = max_ma, color='red')+
geom_hline(yintercept = .5, color='green')+
geom_ma(aes(color="MA 30"),ma_fun = SMA, n=30) +
scale_y_continuous(labels = scales::format_format(scientific= FALSE, big.mark=".", decimal.mark=","))+
labs(y="% of delayed flights", x='Timeline')
If we look at the moving average [red dashed line] we can cleary see that the frequency of delayed flights is moving between 0.2861154 and 0.6568554 [red lines]. This means we can expect, for each day, to have a number of delayed flights ranging from 29 to 66 percent. This analysis is too general, let’s see if we can go into more detail and understand if there are days of the week in which this phenomenon is more\less present.
dayOfWeek_analysis <- no_cancelled_flights %>%
group_by(Year,DayOfWeek) %>%
summarise(TotalDelay= sum(ArrDelay)) # create data frame grouped by year and the number of flight for each year
ggplot(dayOfWeek_analysis, aes(x=Year, y=TotalDelay, color=DayOfWeek))+
geom_line()+
scale_y_continuous(labels = scales::format_format(scientific= FALSE, big.mark=".", decimal.mark=","))+
labs(color = "Day of Week")
Considering only the total amount of delay for each year it’s clear that Saturday is the best day to take a flight because in that day the amount of accumulated delay is less than al the other days. Let’s see if this situation still stays if we consider the percentage of delay made day by day all the years in our dataset.
Even in this analysis Saturday is the best day of the week to fly to minimize delay. In fact, the average percentage of delay accumulated every day is moving between 0.3052416 and 0.5160661, which is the lowest range among all the 7 days. Follow Sunday with an average falling into 0.3519014 and 0.5342165. The worst days are Friday and Thursday with averages falling respectively in [0.4056204 ; 0.621423] and [0.3938655; 0.6306774]
The same discourse of the delays also applies to the seasons, in fact, it would be useful to know in what season to book the holidays to minimize delays. Before going ahead we should create the variable that will contain the seasons, which will follow the following rules:
Summer includes the months of June, July and August;
Winter includes the months of December, January and February;
Fall includes the months of September, October and November;
Spring includes the months of March, April and May.
no_cancelled_flights$Month <- as.factor(no_cancelled_flights$Month)
no_cancelled_flights$Month<- recode(no_cancelled_flights$Month,
"1"= "Jan",
"2"= "Feb",
"3"= "Mar",
"4"= "Apr",
"5"= "May",
"6"= "Jun",
"7"= "Jul",
"8"= "Aug",
"9"= "Sep",
"10"= "Oct",
"11"= "Nov",
"12"= "Dec")
no_cancelled_flights$Season <- no_cancelled_flights$Month
no_cancelled_flights$Season<- recode(no_cancelled_flights$Season,
"Jan" = "Winter",
"Feb" = "Winter",
"Mar" = "Spring",
"Apr" = "Spring",
"May" = "Spring",
"Jun" = "Summer",
"Jul" = "Summer",
"Aug" = "Summer",
"Sep" = "Fall",
"Oct" = "Fall",
"Nov" = "Fall",
"Dec" = "Winter")
season_analysis <- no_cancelled_flights %>%
group_by(Year,Season) %>%
summarise(TotalDelay= sum(ArrDelay)) # create data frame grouped by year and the number of flight for each year
season_plot <- ggplot(season_analysis, aes(x=Year, y=TotalDelay, color=Season))+
geom_line()+
scale_y_continuous(labels = scales::format_format(scientific= FALSE, big.mark=".", decimal.mark=","))+
labs(color = "Season")
season_plot
Looking at the above plot, which considers the total amount of delay accumulated each year, we can say that the best season to flight is Fall followed by:
Spring
Winter
Summer
Again let’s have a close look at the percentage of delayed flights over the total number of flights day by day for all the years inside our dataset.
late_flight <- no_cancelled_flights %>%
filter(ArrDelay > 0)
delay_analysis_season <- late_flight %>%
group_by(Date, Season) %>%
summarise(NumberOfFlights = n())
temp <- no_cancelled_flights %>%
group_by(Date, Season) %>%
summarise(NumberOfFlights = n())
delay_analysis_season <- full_join(temp, delay_analysis_season, by="Date")
delay_analysis_season$PercLateFlight <- (delay_analysis_season$NumberOfFlights.y / delay_analysis_season$NumberOfFlights.x)
# Days with no delayed flight will have NA as a value of column PercLateFlight so here we subistute with 0 which
# means 0% of delayed flights
delay_analysis_season$PercLateFlight[is.na(delay_analysis_season$PercLateFlight)] <- 0
winter <- delay_analysis_season[delay_analysis_season$Season.x == "Winter",]
fall <- delay_analysis_season[delay_analysis_season$Season.x == "Fall",]
summer <- delay_analysis_season[delay_analysis_season$Season.x == "Summer",]
spring <- delay_analysis_season[delay_analysis_season$Season.x == "Spring", ]
############# WINTER PLOT #####################
moving_average_30 <- rollmean(winter$PercLateFlight, 30)
min_ma <- min(moving_average_30)
max_ma <- max(moving_average_30)
winter_plot <- winter %>%
ggplot(aes(x=Date, y=PercLateFlight))+
geom_line()+
geom_hline(yintercept = min_ma, color='red')+
geom_hline(yintercept = max_ma, color='red')+
geom_hline(yintercept = .5, color='green')+
geom_ma(aes(color="MA 30"),ma_fun = SMA, n=30) +
scale_y_continuous(labels = scales::format_format(scientific= FALSE, big.mark=".", decimal.mark=","))+
labs(y="% of delayed flights", x='Timeline')+
ggtitle("WINTER")
############# FALL PLOT #####################
moving_average_30 <- rollmean(fall$PercLateFlight, 30)
min_ma <- min(moving_average_30)
max_ma <- max(moving_average_30)
fall_plot <- fall %>%
ggplot(aes(x=Date, y=PercLateFlight))+
geom_line()+
geom_hline(yintercept = min_ma, color='red')+
geom_hline(yintercept = max_ma, color='red')+
geom_hline(yintercept = .5, color='green')+
geom_ma(aes(color="MA 30"),ma_fun = SMA, n=30) +
scale_y_continuous(labels = scales::format_format(scientific= FALSE, big.mark=".", decimal.mark=","))+
labs(y="% of delayed flights", x='Timeline')+
ggtitle("FALL")
############# SUMMER PLOT #####################
moving_average_30 <- rollmean(summer$PercLateFlight, 30)
min_ma <- min(moving_average_30)
max_ma <- max(moving_average_30)
summer_plot <- summer %>%
ggplot(aes(x=Date, y=PercLateFlight))+
geom_line()+
geom_hline(yintercept = min_ma, color='red')+
geom_hline(yintercept = max_ma, color='red')+
geom_hline(yintercept = .5, color='green')+
geom_ma(aes(color="MA 30"),ma_fun = SMA, n=30) +
scale_y_continuous(labels = scales::format_format(scientific= FALSE, big.mark=".", decimal.mark=","))+
labs(y="% of delayed flights", x='Timeline')+
ggtitle("SUMMER")
############# SPRING PLOT #####################
moving_average_30 <- rollmean(spring$PercLateFlight, 30)
min_ma <- min(moving_average_30)
max_ma <- max(moving_average_30)
spring_plot <- spring %>%
ggplot(aes(x=Date, y=PercLateFlight))+
geom_line()+
geom_hline(yintercept = min_ma, color='red')+
geom_hline(yintercept = max_ma, color='red')+
geom_hline(yintercept = .5, color='green')+
geom_ma(aes(color="MA 30"),ma_fun = SMA, n=30) +
scale_y_continuous(labels = scales::format_format(scientific= FALSE, big.mark=".", decimal.mark=","))+
labs(y="% of delayed flights", x='Timeline')+
ggtitle("SPRING")
winter_ma <- rollmean(winter$PercLateFlight, 30)
winter_ma.max <- max(winter_ma)
winter_ma.min <- min(winter_ma)
fall_ma <- rollmean(fall$PercLateFlight, 30)
fall_ma.max <- max(fall_ma)
fall_ma.min <- min(fall_ma)
summer_ma <- rollmean(summer$PercLateFlight, 30)
summer_ma.max <- max(summer_ma)
summer_ma.min <- min(summer_ma)
spring_ma <- rollmean(spring$PercLateFlight, 30)
spring_ma.max <- max(spring_ma)
spring_ma.min <- min(spring_ma)
winter_plot;
fall_plot;
summer_plot;
spring_plot
The four plot above shows us that Fall leads the group, as the best season to flight to minimize delays, with an average of delayed flights that falls in [0.287182; 0.5681095]. The ranking continues as:
Spring with average falling in [0.3086707; 0.6093402];
Summer and Winter with averages falling respectively in [0.3618468; 0.6279626] and [0.334398; 0.6568554]